library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd) # install from source: https://github.com/PDiracDelta/CONSTANd/
source('./other_functions.R')
source('./plotting_functions.R')
This notebook presents an isobaric labeling data analysis strategy that includes data-driven normalization.
In other notebooks in this series we have systematically varied components and observed how they affect the outcome of a DEA analysis. We have seen that medianSweeping normalization does not work well for intensities on the original scale, and that CONSTANd does not work well on log2-transformed intensities. Here we compare medianSweeping on log2 scale, which we know does a good job, with CONSTANd on original intensity scale.
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# Spiked-in (UPS) proteins are recognizable by the 'ups' tag in their accession.
spiked.proteins <- dat.l %>%
  distinct(Protein) %>%
  filter(stri_detect(Protein, fixed='ups')) %>%
  pull() %>%
  as.character()
#TEMP -remove!
# tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# dat.l <- dat.l %>% filter(Protein %in% c(tmp[sample(1:length(tmp), size=20)], spiked.proteins))
# Names and number of the normalization variants being compared.
variant.names <- c('medianSweeping', 'CONSTANd')
n.comp.variants <- length(variant.names)
# Intensity scale each variant operates on: medianSweeping on log2, CONSTANd on raw.
scale.vec <- c('log', 'raw')
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# Map each biological condition (dilution factor) to a plotting colour.
condition.color <- tribble(
  ~Condition, ~Color,
  "0.125", 'black',
  "0.5",   'blue',
  "0.667", 'green',
  "1",     'red'
)
# create data frame with sample info (distinct Run, Channel, Sample, Condition, Color)
sample.info <- get_sample_info(dat.l, condition.color)
channelNames <- remove_factors(unique(sample.info$Channel))
# One input data set per variant: log2 intensities for medianSweeping,
# untransformed intensities for CONSTANd. Both store the value as 'response'.
dat.unit.l <- emptyList(variant.names)
dat.unit.l$medianSweeping <- dat.l %>%
  mutate(response = log2(Intensity)) %>%
  select(-Intensity)
dat.unit.l$CONSTANd <- dat.l %>% rename(response = Intensity)
CONSTANd vs medianSweeping (in 2 steps)
# switch to wide format (one column per quantification channel)
dat.unit.w <- lapply(dat.unit.l, function(dat.long) {
  pivot_wider(data = dat.long, id_cols = -one_of(c('Condition', 'BioReplicate')),
              names_from = Channel, values_from = response)
})
dat.norm.w <- emptyList(names(dat.unit.w))
# subtract the spectrum median log2intensity from the observed log2intensities
tmp.ms <- dat.unit.w$medianSweeping
spectrum.medians <- apply(tmp.ms[, channelNames], 1, median, na.rm = TRUE)
tmp.ms[, channelNames] <- sweep(tmp.ms[, channelNames], 1, spectrum.medians)
dat.norm.w$medianSweeping <- tmp.ms
dat.norm.w$medianSweeping
## # A tibble: 94,698 x 23
## Mixture TechRepMixture Run Protein Peptide RT Charge PTM
## <fct> <fct> <fct> <fct> <fct> <dbl> <fct> <fct>
## 1 Mixtur… 1 Mixt… Q9NQC3 [R].hQ… 101. 4 N-Te…
## 2 Mixtur… 1 Mixt… P05141 [K].dF… 167. 3 N-Te…
## 3 Mixtur… 1 Mixt… P49189 [K].tV… 183. 2 N-Te…
## 4 Mixtur… 1 Mixt… P62263 [R].iE… 133. 2 N-Te…
## 5 Mixtur… 1 Mixt… P21291 [K].gF… 145. 2 N-Te…
## 6 Mixtur… 1 Mixt… P30511 [K].wA… 137. 2 N-Te…
## 7 Mixtur… 1 Mixt… Q13185 [R].lT… 106. 2 N-Te…
## 8 Mixtur… 1 Mixt… P02787… [K].sA… 149. 3 N-Te…
## 9 Mixtur… 1 Mixt… Q9UBQ5 [K].iD… 202. 2 N-Te…
## 10 Mixtur… 1 Mixt… Q5H9R7 [R].tG… 107. 2 N-Te…
## # … with 94,688 more rows, and 15 more variables: TotalIntensity <dbl>,
## # Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## # onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## # `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## # `130N` <dbl>
Now let’s apply CONSTANd to each Run separately, and then combine the results into a semi-wide dataframe again.
# dat.unit.l entries are in long format so all have same colnames and no channelNames
# CONSTANd must be applied to each MS run separately: split, normalize, recombine.
runs <- split(dat.unit.w$CONSTANd, dat.unit.w$CONSTANd$Run)
runs.norm <- lapply(runs, function(run.dat) {
  run.dat[, channelNames] <- CONSTANd(run.dat[, channelNames])$normalized_data
  run.dat
})
dat.norm.w$CONSTANd <- bind_rows(runs.norm)
dat.norm.w$CONSTANd
## # A tibble: 94,698 x 23
## Mixture TechRepMixture Run Protein Peptide RT Charge PTM
## <fct> <fct> <fct> <fct> <fct> <dbl> <fct> <fct>
## 1 Mixtur… 1 Mixt… Q99447 [K].rT… 61.6 3 N-Te…
## 2 Mixtur… 1 Mixt… Q86X55 [R].lL… 142. 2 N-Te…
## 3 Mixtur… 1 Mixt… Q96A33 [K].iM… 115. 2 N-Te…
## 4 Mixtur… 1 Mixt… Q9Y3D7 [K].sV… 129. 2 N-Te…
## 5 Mixtur… 1 Mixt… P18085 [R].iQ… 146. 3 N-Te…
## 6 Mixtur… 1 Mixt… P14868 [R].vT… 160. 3 N-Te…
## 7 Mixtur… 1 Mixt… Q13813 [K].dL… 119. 2 N-Te…
## 8 Mixtur… 1 Mixt… P49736 [R].gL… 177. 3 N-Te…
## 9 Mixtur… 1 Mixt… Q96T88 [K].lW… 196. 3 N-Te…
## 10 Mixtur… 1 Mixt… P20700 [K].sM… 96.2 2 N-Te…
## # … with 94,688 more rows, and 15 more variables: TotalIntensity <dbl>,
## # Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## # onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## # `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## # `130N` <dbl>
Summarize quantification values from PSM to peptide (first step) to protein (second step).
# normalized data
# group by (run,)protein,peptide then summarize twice (once on each level):
# summarise() peels one grouping level per call, so the first call collapses
# PSMs to peptides (per Run, Protein, Peptide) and the second collapses
# peptides to proteins (per Run, Protein).
# Uses across() instead of the superseded summarise_at().
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  x %>%
    group_by(Run, Protein, Peptide) %>%
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    summarise(across(all_of(channelNames), function(v) median(v, na.rm = TRUE))) %>%
    ungroup()
})
Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).
# medianSweeping: in each channel, subtract median computed across all proteins within the channel
# do the above separately for each MS run
x.split <- split(dat.norm.summ.w$medianSweeping, dat.norm.summ.w$medianSweeping$Run)
x.split.norm <- lapply(x.split, function(y) {
  channel.medians <- apply(y[, channelNames], 2, median, na.rm = TRUE)
  y[, channelNames] <- sweep(y[, channelNames], 2, channel.medians)
  y
})
dat.norm.summ.w$medianSweeping <- bind_rows(x.split.norm)
# make data completely wide (also across runs): columns become "Run:Channel"
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames),
                    names_glue = "{Run}:{.value}")
})
# use (half-)wide format: one boxplot panel per normalization variant
par(mfrow = c(1, 2))
for (variant in variant.names) {
  boxplot_w(dat.norm.summ.w[[variant]], sample.info,
            paste('Normalized', variant, sep = '_'))
}
MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).
# use wide2 format
# One MA plot per variant, on that variant's own scale (log vs raw).
p <- emptyList(variant.names)
for (i in seq_len(n.comp.variants)) {
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N',
                       scale.vec[i], paste('Normalized', variant.names[i], sep = '_'),
                       spiked.proteins)
}
grid.arrange(p[[1]], p[[2]], ncol = 2)
MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).
# Samples in the numerator (condition 1) and denominator (condition 0.125) of the ratio.
channels.num <- sample.info %>% filter(Condition=='1') %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% select(Sample) %>% pull
p <- emptyList(variant.names)
# seq_len() instead of 1:n.comp.variants (safe sequence idiom).
for (i in seq_len(n.comp.variants)) {
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom,
                       scale = scale.vec[i],
                       paste('Normalized', variant.names[i], sep = '_'),
                       spiked.proteins)
}
grid.arrange(p[[1]], p[[2]], ncol = 2)
par(mfrow = c(1, 2))
# PCA of all proteins, one panel per normalization variant.
for (i in seq_len(n.comp.variants)) {
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'),
              info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
}
par(mfrow = c(1, 2))
# PCA restricted to the spiked-in proteins only.
for (i in seq_len(n.comp.variants)) {
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>%
                filter(Protein %in% spiked.proteins) %>% select(-'Protein'),
              info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
}
Only use spiked proteins
par(mfrow = c(1, 2))
# Hierarchical clustering dendrogram of samples, spiked proteins only.
for (i in seq_len(n.comp.variants)) {
  dendrogram_ils(dat.norm.summ.w2[[variant.names[i]]] %>%
                   filter(Protein %in% spiked.proteins) %>% select(-Protein),
                 info = sample.info, paste('Normalized', variant.names[i], sep = '_'))
}
NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
# Run a moderated t-test per variant, on the scale that variant was normalized on
# (looked up in scale.vec via the variant's position in variant.names).
for (variant in names(dat.dea)) {
  this_scale <- scale.vec[match(variant, variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[variant]]), 'Protein')
  dat.dea[[variant]] <- moderated_ttest(dat = d, design.matrix, scale = this_scale)
}
# Confusion matrices at q.mod < 0.05, using spiked proteins as ground truth.
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
| background | spiked | background | spiked | |
|---|---|---|---|---|
| not_DE | 4732 | 4 | 4735 | 5 |
| DE | 6 | 17 | 3 | 16 |
| medianSweeping | CONSTANd | |
|---|---|---|
| Accuracy | 0.998 | 0.998 |
| Sensitivity | 0.810 | 0.762 |
| Specificity | 0.999 | 0.999 |
| PPV | 0.739 | 0.842 |
| NPV | 0.999 | 0.999 |
| background | spiked | background | spiked | |
|---|---|---|---|---|
| not_DE | 4736 | 17 | 4736 | 15 |
| DE | 2 | 4 | 2 | 6 |
| medianSweeping | CONSTANd | |
|---|---|---|
| Accuracy | 0.996 | 0.996 |
| Sensitivity | 0.190 | 0.286 |
| Specificity | 1.000 | 1.000 |
| PPV | 0.667 | 0.750 |
| NPV | 0.996 | 0.997 |
| background | spiked | background | spiked | |
|---|---|---|---|---|
| not_DE | 4732 | 5 | 4731 | 3 |
| DE | 2 | 16 | 3 | 18 |
| medianSweeping | CONSTANd | |
|---|---|---|
| Accuracy | 0.999 | 0.999 |
| Sensitivity | 0.762 | 0.857 |
| Specificity | 1.000 | 0.999 |
| PPV | 0.889 | 0.857 |
| NPV | 0.999 | 0.999 |
# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)
scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)
scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)
# seq_len() instead of 1:n.contrasts: loops zero times (rather than over
# c(1, 0)) in the degenerate case where no logFC columns are found.
for (i in seq_len(n.contrasts)) {
  volcanoplot_ils(dat.dea, i, spiked.proteins)
}
Let’s see whether the spiked protein fold changes make sense.
# plot theoretical value (horizontal lines) and violin per variant
spiked.logfc <- lapply(dat.dea, function(dea.tab) dea.tab[spiked.proteins, logFC.cols])
violinplot_ils(spiked.logfc, referenceCondition)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_BE.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_BE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_BE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] CONSTANd_0.99.4 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [5] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.4
## [9] tidyverse_1.3.0 psych_2.0.9 limma_3.45.19 kableExtra_1.3.1
## [13] dendextend_1.14.0 gridExtra_2.3 stringi_1.5.3 ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-150 fs_1.5.0 lubridate_1.7.9
## [4] webshot_0.5.2 httr_1.4.2 tools_4.0.3
## [7] backports_1.1.10 utf8_1.1.4 R6_2.4.1
## [10] rpart_4.1-15 DBI_1.1.0 mgcv_1.8-33
## [13] colorspace_1.4-1 nnet_7.3-14 withr_2.3.0
## [16] tidyselect_1.1.0 mnormt_2.0.2 compiler_4.0.3
## [19] cli_2.1.0 rvest_0.3.6 xml2_1.3.2
## [22] labeling_0.4.2 scales_1.1.1 digest_0.6.27
## [25] rmarkdown_2.5 pkgconfig_2.0.3 htmltools_0.5.0
## [28] highr_0.8 dbplyr_1.4.4 rlang_0.4.8
## [31] readxl_1.3.1 rstudioapi_0.11 farver_2.0.3
## [34] generics_0.0.2 jsonlite_1.7.1 ModelMetrics_1.2.2.2
## [37] magrittr_1.5 Matrix_1.2-18 Rcpp_1.0.5
## [40] munsell_0.5.0 fansi_0.4.1 viridis_0.5.1
## [43] lifecycle_0.2.0 pROC_1.16.2 yaml_2.2.1
## [46] MASS_7.3-53 plyr_1.8.6 recipes_0.1.14
## [49] grid_4.0.3 blob_1.2.1 parallel_4.0.3
## [52] crayon_1.3.4 lattice_0.20-41 haven_2.3.1
## [55] splines_4.0.3 hms_0.5.3 tmvnsim_1.0-2
## [58] knitr_1.30 pillar_1.4.6 stats4_4.0.3
## [61] reshape2_1.4.4 codetools_0.2-16 reprex_0.3.0
## [64] glue_1.4.2 evaluate_0.14 data.table_1.13.2
## [67] modelr_0.1.8 vctrs_0.3.4 foreach_1.5.1
## [70] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [73] xfun_0.18 gower_0.2.2 prodlim_2019.11.13
## [76] broom_0.7.2 e1071_1.7-4 class_7.3-17
## [79] survival_3.2-7 viridisLite_0.3.0 timeDate_3043.102
## [82] iterators_1.0.13 lava_1.6.8 ellipsis_0.3.1
## [85] caret_6.0-86 ipred_0.9-9